{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# create entry points to spark\n",
    "try:\n",
    "    sc.stop()\n",
    "except:\n",
    "    pass\n",
    "from pyspark import SparkContext, SparkConf\n",
    "from pyspark.sql import SparkSession\n",
    "sc=SparkContext()\n",
    "spark = SparkSession(sparkContext=sc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Example data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
      "|            model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|\n",
      "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
      "|        Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|\n",
      "|    Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|\n",
      "|       Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|\n",
      "|   Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|\n",
      "|Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|\n",
      "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "mtcars = spark.read.csv('../../data/mtcars.csv', inferSchema=True, header=True)\n",
    "# correct first column name\n",
    "mtcars = mtcars.withColumnRenamed('_c0', 'model')\n",
    "mtcars.show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select columns by name\n",
    "\n",
    "We can simply use the **select()** function to select columns by name."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+-----+\n",
      "| hp| disp|\n",
      "+---+-----+\n",
      "|110|160.0|\n",
      "|110|160.0|\n",
      "| 93|108.0|\n",
      "|110|258.0|\n",
      "|175|360.0|\n",
      "+---+-----+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "mtcars.select(['hp', 'disp']).show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select columns by index\n",
    "\n",
    "We can convert indices to corresponding column names and then select columns by name."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['model', 'disp', 'hp', 'qsec']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "indices = [0,3,4,7]\n",
    "selected_columns =  [mtcars.columns[index] for index in indices]\n",
    "selected_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------------+-----+---+-----+\n",
      "|            model| disp| hp| qsec|\n",
      "+-----------------+-----+---+-----+\n",
      "|        Mazda RX4|160.0|110|16.46|\n",
      "|    Mazda RX4 Wag|160.0|110|17.02|\n",
      "|       Datsun 710|108.0| 93|18.61|\n",
      "|   Hornet 4 Drive|258.0|110|19.44|\n",
      "|Hornet Sportabout|360.0|175|17.02|\n",
      "+-----------------+-----+---+-----+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "mtcars.select(selected_columns).show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select columns by pattern\n",
    "\n",
    "Example: columns start with 'd'."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['disp', 'drat']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import re\n",
    "selected_columns = [x for x in mtcars.columns if re.compile('^d').match(x) is not None]\n",
    "selected_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+----+\n",
      "| disp|drat|\n",
      "+-----+----+\n",
      "|160.0| 3.9|\n",
      "|160.0| 3.9|\n",
      "|108.0|3.85|\n",
      "|258.0|3.08|\n",
      "|360.0|3.15|\n",
      "+-----+----+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "mtcars.select(selected_columns).show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
